In [1]:
import graphlab
import os
In [2]:
# Remote location of the amazon_baby.csv file; used as the default `url`
# argument of get_data().
URL = 'https://d396qusza40orc.cloudfront.net/phoenixassets/amazon_baby.csv'
In [3]:
def get_data(filename='amazon_baby.csv', url=URL, force_download=False):
    """Download and cache the amazon data.

    Parameters
    ----------
    filename: string (optional)
        location to save the data; the SFrame is loaded from this same path
    url: string (optional)
        web location the data is fetched from when it is not cached
    force_download: bool (optional)
        if True, force redownload of data

    Returns
    -------
    data: graphlab SFrame. Similar to a pandas DataFrame,
        but with capacity for faster analysis of larger data sets
    """
    # Local import keeps the function self-contained; urlretrieve lives in a
    # different module on Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Only hit the network when there is no cached copy (or when forced).
    if force_download or not os.path.exists(filename):
        urlretrieve(url, filename)

    # Load from `filename`, not a hard-coded name, so the file that was just
    # cached is the one that gets read.
    sf = graphlab.SFrame(filename)
    return sf
# Load the data with the default filename/url (get_data fetches the CSV only
# if it is not already cached locally) and preview the first rows.
products = get_data()
products.head()
Out[3]:
In [4]:
# Add a 'word_count' column produced by graphlab's count_words from the
# 'review' column; it is the only feature the sentiment model is trained on.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])
products.head()
Out[4]:
In [5]:
# Set graphs to appear within the notebook
graphlab.canvas.set_target('ipynb')
# Show a summary of the 'name' column to look at products with the most reviews
products['name'].show()
In [6]:
# Keep only the rows for one product, selected by exact name match, and count them.
# Note: this subset is taken before the 3-star rows are dropped from `products`
# further down, so it still contains them.
giraffe_reviews = products[products['name']=='Vulli Sophie the Giraffe Teether']
len(giraffe_reviews)
Out[6]:
In [7]:
# Categorical view of the 'rating' column for the giraffe teether rows only.
giraffe_reviews['rating'].show(view='Categorical')
In [8]:
# Categorical view of the 'rating' column across all products, for comparison.
products['rating'].show(view='Categorical')
In [9]:
# I consider 3 star ratings neutral, so they will be removed from the dataset.
# Note: `products` is reassigned here, so the unfiltered frame is no longer
# available under this name after the cell runs.
products = products[products['rating'] != 3]
In [10]:
# if rating is 4 or 5, the review is considered positive (1), else negative (0).
# 3-star rows were removed from `products` just before this, so "else" covers
# only ratings below 3. The new 'sentiment' column is the model's target.
products['sentiment'] = products['rating'] >= 4
products.head()
Out[10]:
In [11]:
# Randomly split the data set into training data (fraction .8) and test data;
# the fixed seed keeps the split reproducible across runs.
train_data, test_data = products.random_split(.8, seed=0)
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(len(train_data))
print(len(test_data))
In [12]:
# Train a logistic classifier that predicts the 'sentiment' column from the
# 'word_count' column only.
# NOTE(review): test_data is passed as validation_set here and is also the set
# the model is evaluated on afterwards - confirm it is not used for tuning.
sentiment_model = graphlab.logistic_classifier.create(train_data,
target='sentiment',
features=['word_count'],
validation_set=test_data)
In [13]:
# evaluate the model on the test split, requesting only the roc curve metric
sentiment_model.evaluate(test_data, metric='roc_curve')
Out[13]:
In [14]:
# Open the model's 'Evaluation' view.
sentiment_model.show(view='Evaluation')
In [15]:
# Score every giraffe review with the trained model and store the result in a
# new 'predicted_sentiment' column. output_type='probability' asks for a
# probability instead of a class label - presumably the probability of the
# positive class; confirm against the GraphLab Create docs.
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(giraffe_reviews, output_type='probability')
giraffe_reviews.head()
Out[15]:
In [16]:
# Order the giraffe reviews from highest to lowest predicted sentiment; the
# positional lookups in the following cells rely on this ordering.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)
giraffe_reviews.head()
Out[16]:
In [17]:
# review with highest predicted sentiment (first row of the descending sort)
giraffe_reviews[0]['review']
Out[17]:
In [18]:
# review with the second-highest predicted sentiment
giraffe_reviews[1]['review']
Out[18]:
In [19]:
# review with lowest predicted sentiment (last row of the descending sort)
giraffe_reviews[-1]['review']
Out[19]:
In [20]:
# Giraffe reviews rated 4 or higher whose predicted sentiment is below 0.5,
# i.e. positive ratings that the model scored as negative.
giraffe_reviews[(giraffe_reviews['rating']>=4) & (giraffe_reviews['predicted_sentiment'] < 0.5)]
Out[20]:
In [21]:
# Text of the first review rated 4 or higher that was scored below 0.5.
# NOTE(review): the [0] lookup presumably fails if the filter matches no rows - confirm.
giraffe_reviews[(giraffe_reviews['rating']>=4) & (giraffe_reviews['predicted_sentiment'] < 0.5)][0]['review']
Out[21]:
In [22]:
# Giraffe reviews rated 2 or lower whose predicted sentiment is above 0.5,
# i.e. negative ratings that the model scored as positive.
giraffe_reviews[(giraffe_reviews['rating']<=2) & (giraffe_reviews['predicted_sentiment'] > 0.5)]
Out[22]:
In [23]:
# Text of the first review rated 2 or lower that was scored above 0.5.
# NOTE(review): the [0] lookup presumably fails if the filter matches no rows - confirm.
giraffe_reviews[(giraffe_reviews['rating']<=2) & (giraffe_reviews['predicted_sentiment'] > 0.5)][0]['review']
Out[23]:
In [ ]: